# libraries
import PyPDF2
import pandas as pd
import nltk
nltk.download("punkt")
import re

import spacy
# only for datalore: download the spaCy English model from inside the notebook
import subprocess

# Run the spaCy model download through the shell and print its captured output
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
# Load the small English pipeline; `nlp` is used for POS tagging further down
nlp = spacy.load("en_core_web_sm")

import textacy
import summa
from summa import keywords

from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint
from itertools import combinations
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier

import networkx as nx
from matplotlib import pyplot as plt
Collecting en-core-web-sm==3.7.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 37.7 MB/s eta 0:00:00
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.0)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.2)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.6.4)
Requirement already satisfied: jinja2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.2)
Requirement already satisfied: setuptools in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (56.0.0)
Requirement already satisfied: packaging>=20.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (23.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)
Requirement already satisfied: numpy>=1.15.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.24.3)
Requirement already satisfied: annotated-types>=0.4.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)
Requirement already satisfied: pydantic-core==2.16.3 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.16.3)
Requirement already satisfied: typing-extensions>=4.6.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.2.0)
Requirement already satisfied: idna<4,>=2.5 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in /opt/python/envs/minimal/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2023.7.22)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /opt/python/envs/minimal/lib/python3.8/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/python/envs/minimal/lib/python3.8/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/python/envs/minimal/lib/python3.8/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.3)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1

[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: pip install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
[nltk_data] Downloading package punkt to /home/datalore/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.

Import Text

# import data
# Open the PDF in binary mode. The filename and the mode must be two separate
# arguments; adjacent string literals would be concatenated into one bogus
# filename. The context manager closes the file even if parsing fails.
with open('The_Shadow_Over_Innsmouth.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    # how many pages
    print(len(pdfReader.pages))
    # the sequence of page objects
    pageObj = pdfReader.pages

    # extract the text of every page while the file is still open
    text = [page.extract_text() for page in pageObj]

# text
# we can see that the page number was added at the beginning of each page.
# we will remove them in the next step:
219

Convert to Sentences and Pandas

# Put together into a giant text
book = " ".join(text)

# Rebuild the token list, stripping digits only from tokens that start with a
# number and end with a letter (e.g. a page number fused to the first word of
# a page). All other tokens are kept unchanged.
# NOTE: re.search returns None (not the string "None") when nothing matches,
# so the match object is tested for truthiness directly.
saved_words = [
    re.sub(r'[0-9]', '', word) if re.search(r'^[0-9].*[a-zA-Z]$', word) else word
    for word in nltk.word_tokenize(book)
]

# word tokenized text joined back into a single string
book = ' '.join(saved_words)
type(book)
str
# Check here:

# saved_words
# book = " ".join(text)
# print(book)

Create a dataframe

# Split the cleaned book into sentences, one row per sentence
sentence_list = nltk.sent_tokenize(book)
DF = pd.DataFrame(data=sentence_list, columns=["sentences"])

# Preview the first ten sentences
DF.head(10)

# for IE, we want sentence and/or paragraph level structure 

Part of Speech Tagging

  • Tag your data with spacy’s part of speech tagger.
  • Convert this data into a Pandas DataFrame.
# easier to loop over the big text file than loop over words AND rows in pandas
# Each tuple is (token text, fine-grained tag, universal POS tag)
spacy_pos_tagged = [(str(word), word.tag_, word.pos_) for word in nlp(book)]

# each row represents one token
# The three column names must be separate list items; without the commas the
# adjacent literals merge into a single name and pandas rejects the 3-field rows.
DF_POS = pd.DataFrame(
    spacy_pos_tagged,
    columns=["token", "specific_tag", "upos"],
)
DF_POS.head(20)
  • Use the dataframe to calculate the most common parts of speech.
# Frequency of each universal POS tag.
# PUNCT is punctuation rather than a part of speech, so the most common
# parts of speech are nouns and verbs.
# X marks tokens the tagger could not figure out.
upos_tags = DF_POS["upos"]
upos_tags.value_counts()

Use the dataframe to calculate if words are considered more than one part of speech (crosstabs or groupby).

- What is the most common part of speech? The most common parts of speech are nouns and verbs.
- Do you see words that are multiple parts of speech?

# Count how often each token receives each universal POS tag
DF_POS2 = pd.crosstab(index=DF_POS["token"], columns=DF_POS["upos"])

# A non-zero cell means the token was seen with that tag at least once, so
# counting the non-zero cells per row gives the number of distinct tags
DF_POS2["total"] = DF_POS2.ne(0).sum(axis=1)

# Show only the tokens that were tagged with more than one part of speech
DF_POS2.loc[DF_POS2["total"] > 1]

Answer:

There are 517 tokens that are tagged with more than one part of speech (found using the crosstab above).

KPE

Use textacy to find the key phrases in your text.

  • For R users: run the following in the R console
  • library(reticulate)
  • py_install("networkx < 3.0", pip = T)
# textacy KPE
# Build an English spaCy pipeline for textacy with the parser disabled.
# `disable` is passed as a real one-element tuple: ("parser") without the
# trailing comma is just the string "parser".
en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))

# build a processor for textacy using spacy and process text
doc = textacy.make_spacy_doc(book, lang=en)

# TextRank algorithm: print the top 5 key phrases, normalized to lemmas
print([kps for kps, weights in textacy.extract.keyterms.textrank(doc, normalize="lemma", topn=5)])

# Collect the key terms from a default TextRank run and group variants of the same term
terms = {term for term, weight in textacy.extract.keyterms.textrank(doc)}
print(textacy.extract.utils.aggregate_term_variants(terms))
['in', 'enwikisource Jmj Steinsplitter Leonel Sohns Rocket Dbenbenn Zscout Jacobolus Indolences Technion Dha Abigor Reisio Blurpeace Dschwen ToBeFree Boris KABALINI Bromskloss Tene ~ commonswiki AzaT oth Bender PatríciaR', 'like thing Obed brung', 'old man Marsh', 'old Captain Obed Marsh']
[{'enwikisource Jmj Steinsplitter Leonel Sohns Rocket Dbenbenn Zscout Jacobolus Indolences Technion Dha Abigor Reisio Blurpeace Dschwen ToBeFree Boris KABALINI Bromskloss Tene ~ commonswiki AzaT oth Bender PatríciaR'}, {'Pasicles NBRBH Tbthatcher blindguynw John V andenber g Pathoschild Jarama CalendulaAsteraceae Xover Boja Pathosbot Nonexyst Cneubauer Danny'}, {'fine old residence street'}, {'old Captain Obed Marsh'}, {'like thing Obed brung'}, {'watery eyed old man'}, {'old man Marsh'}, {'old Innsmouth'}, {'old folk'}, {'in'}]
# Keyword extraction with summa; scores=True returns (keyword, score) pairs
TR_keywords = keywords.keywords(book, scores=True)

# Show the first ten entries
top_ten = TR_keywords[:10]
print(top_ten)
[('o', 0.23146800097563744), ('things', 0.16344413847160408), ('thing', 0.16344413847160408), ('streets', 0.1524065302609831), ('street', 0.1524065302609831), ('looks', 0.14318073656131747), ('look', 0.14318073656131747), ('looking', 0.14318073656131747), ('looked', 0.14318073656131747), ('innsmouth', 0.12145676453793346)]
# Print the complete scored keyword list (the output is very long)
print(TR_keywords)
[('o', 0.23146800097563744), ('things', 0.16344413847160408), ('thing', 0.16344413847160408), ('streets', 0.1524065302609831), ('street', 0.1524065302609831), ('looks', 0.14318073656131747), ('look', 0.14318073656131747), ('looking', 0.14318073656131747), ('looked', 0.14318073656131747), ('innsmouth', 0.12145676453793346), ('d', 0.11736307228162549), ('folk', 0.1122252400844271), ('folks', 0.1122252400844271), ('like', 0.11150063487716365), ('likely', 0.11150063487716365), ('likes', 0.11150063487716365), ('liked', 0.11150063487716365), ('old story', 0.1061869945830989), ('obed', 0.1015687632177302), ('open', 0.09603899325193906), ('opened', 0.09603899325193906), ('opening', 0.09603899325193906), ('opener', 0.09603899325193906), ('openly', 0.09603899325193906), ('em', 0.09340997475284482), ('fishing', 0.09337612163416342), ('fish', 0.09337612163416342), ('fishes', 0.09337612163416342), ('certainly', 0.08466906631930936), ('people', 0.07885262889613755), ('time', 0.07815491484291391), ('times', 0.07815491484291391), ('aout', 0.07607415066500255), ('houses', 0.07592836705162326), ('house', 0.07592836705162326), ('hous', 0.07592836705162326), ('abaout', 0.07527876833629249), ('long', 0.074309519470541), ('queerly', 0.07325950328847518), ('ancient', 0.0718396847509192), ('marshes', 0.07119510128909552), ('marsh', 0.07119510128909552), ('water', 0.07074334692050642), ('waters', 0.07074334692050642), ('sounded', 0.07053140039685735), ('sound', 0.07053140039685735), ('sounds', 0.07053140039685735), ('black', 0.06922855864129969), ('blackness', 0.06922855864129969), ('blackly', 0.06922855864129969), ('e', 0.06647090460774861), ('line', 0.06582084150781244), ('lined', 0.06582084150781244), ('strange', 0.06530824497752227), ('strangeness', 0.06530824497752227), ('strangely', 0.06530824497752227), ('strang', 0.06530824497752227), ('queer kind', 0.06492038201286143), ('ye', 0.06422663553912718), ('fer', 0.06338091179820589), ('came', 0.06231271002589934), ('y', 
0.06139234476525847), ('change', 0.06027325688367115), ('changes', 0.06027325688367115), ('chang', 0.06027325688367115), ('half', 0.06020918509332096), ('n thought', 0.05885923078740612), ('years', 0.058733722103621905), ('signs', 0.05777553641378831), ('sign', 0.05777553641378831), ('signed', 0.05777553641378831), ('shape', 0.05697176006822429), ('shapes', 0.05697176006822429), ('kinds', 0.056581260737247695), ('humans', 0.05622041327576899), ('humanly', 0.05622041327576899), ('seas', 0.05594131216202816), ('shambled', 0.055924334166541856), ('shambling', 0.055924334166541856), ('shamble', 0.055924334166541856), ('ud', 0.0555809374392699), ('roofs', 0.05521370060234117), ('roof', 0.05521370060234117), ('sea islander', 0.05505145435027858), ('deserted', 0.054475865463949565), ('desertion', 0.054475865463949565), ('shadow', 0.05417211856290571), ('shadowed', 0.05417211856290571), ('shadows', 0.05417211856290571), ('island', 0.05416159653852899), ('islanders', 0.05416159653852899), ('islands', 0.05416159653852899), ('eyes', 0.05407589961174481), ('eye', 0.05407589961174481), ('ahead', 0.0537032969218217), ('certain conditions', 0.053553621670930154), ('faces eyed', 0.05311261662245481), ('began', 0.05296705269912494), ('nearly', 0.05242514942255793), ('face', 0.05214933363316482), ('faced', 0.05214933363316482), ('facing', 0.05214933363316482), ('grown', 0.051770664693321505), ('hey', 0.05146777185468628), ('men', 0.05105133422186897), ('little', 0.050652852415510104), ('crossed', 0.05055362759859681), ('cross', 0.05055362759859681), ('crossing', 0.05055362759859681), ('crosses', 0.05055362759859681), ('seen', 0.0498380474259164), ('brick', 0.0495589674911066), ('past', 0.04938184089679488), ('possibly', 0.04920312360550861), ('possible', 0.04920312360550861), ('window', 0.04912732968748531), ('windows', 0.04912732968748531), ('soon', 0.048097504820287275), ('horror', 0.04801278932901411), ('horrors', 0.04801278932901411), ('great', 0.047768897346798816), ('dark', 
0.04775849208191164), ('darkness', 0.04775849208191164), ('darkly', 0.04775849208191164), ('place near', 0.047439655312215355), ('shewed', 0.04733086620646477), ('shewe', 0.04733086620646477), ('shew', 0.04733086620646477), ('shewing', 0.04733086620646477), ('far', 0.04683769931894315), ('road', 0.046752597030192036), ('roads', 0.046752597030192036), ('door', 0.0467327987259007), ('doors', 0.0467327987259007), ('churches', 0.04671092767828272), ('church', 0.04671092767828272), ('clear', 0.04594606051840739), ('clearly', 0.04594606051840739), ('clearness', 0.04594606051840739), ('way', 0.04570281752592846), ('ways', 0.04570281752592846), ('later', 0.04562672734574253), ('lateral', 0.04562672734574253), ('staring', 0.04484028531272084), ('stared', 0.04484028531272084), ('stare', 0.04484028531272084), ('stone', 0.04464656175269412), ('stones', 0.04464656175269412), ('vague', 0.04457622604332517), ('vaguely', 0.04457622604332517), ('odd', 0.04447271122294179), ('oddly', 0.04447271122294179), ('formed', 0.04400207822320252), ('forms', 0.04400207822320252), ('form', 0.04400207822320252), ('forming', 0.04400207822320252), ('curiously', 0.043918142787188995), ('curious', 0.043918142787188995), ('creakings', 0.04375420800337308), ('creak', 0.04375420800337308), ('creaked', 0.04375420800337308), ('towns', 0.04368623178586662), ('town', 0.04368623178586662), ('glancing', 0.04300055064654736), ('glance', 0.04300055064654736), ('glanced', 0.04300055064654736), ('visible', 0.042973626185744906), ('visibly', 0.042973626185744906), ('visibility', 0.042973626185744906), ('furtive', 0.04263724470593068), ('furtively', 0.04263724470593068), ('furtiveness', 0.04263724470593068), ('places', 0.04245416120187278), ('placed', 0.04245416120187278), ('buildings', 0.04181654937376793), ('away impressions', 0.041647022469030445), ('obscured lines', 0.04097880009374606), ('high', 0.0408349983361498), ('highly', 0.0408349983361498), ('took', 0.04078892672979426), ('bus', 0.04066216423336001), 
('yellow', 0.0403992700608469), ('reef', 0.0403496301780848), ('gods', 0.040276390931760116), ('god', 0.040276390931760116), ('life', 0.040198898031645246), ('square', 0.039962200449568795), ('squares', 0.039962200449568795), ('squar', 0.039962200449568795), ('despite', 0.039924984039020554), ('closed building', 0.03960186201159754), ('got outside', 0.03944169660349382), ('night', 0.039414834164545456), ('connection', 0.03928207608788116), ('connected', 0.03928207608788116), ('connecting', 0.03928207608788116), ('live', 0.03926801065562827), ('living', 0.03926801065562827), ('lived', 0.03926801065562827), ('lives', 0.03926801065562827), ('good', 0.03918912555409997), ('ruins', 0.03886139658642097), ('ruin', 0.03886139658642097), ('ruined', 0.03886139658642097), ('ve changed lately', 0.03842729612141973), ('nt', 0.038161935213575526), ('zadok', 0.038063326548705215), ('faint', 0.03803847345242196), ('faintly', 0.03803847345242196), ('fainting', 0.03803847345242196), ('voices', 0.037997667299607824), ('voice', 0.037997667299607824), ('human bein', 0.03795408995571382), ('said', 0.037936366601283106), ('grew', 0.03773901290010956), ('cover', 0.0376925350403416), ('covered', 0.0376925350403416), ('covering', 0.0376925350403416), ('started', 0.03763703412352251), ('start', 0.03763703412352251), ('starting', 0.03763703412352251), ('starts', 0.03763703412352251), ('closely', 0.037387174649427146), ('closing', 0.037387174649427146), ('h', 0.03730892712012533), ('th year', 0.03718982290179586), ('local man', 0.037160930133838754), ('r', 0.03713236987459057), ('malignity', 0.0369478418579677), ('malignancy', 0.0369478418579677), ('malign', 0.0369478418579677), ('malignant', 0.0369478418579677), ('g', 0.036764878195141404), ('evil', 0.03671933787390332), ('right', 0.03662433442070221), ('hours', 0.03660606396634085), ('hour', 0.03660606396634085), ('coming', 0.036498010624052744), ('come', 0.036498010624052744), ('comes', 0.036498010624052744), ('copyright', 
0.03634980963185349), ('copyrighted', 0.03634980963185349), ('largely', 0.03625787468163887), ('new', 0.036094592753776765), ('fears', 0.036094328660012956), ('feared', 0.036094328660012956), ('fear', 0.036094328660012956), ('fearing', 0.036094328660012956), ('tell', 0.03608714590486151), ('light', 0.03608055452501991), ('wild', 0.036064816994221614), ('wildly', 0.036064816994221614), ('late', 0.03593401918620346), ('lateness', 0.03593401918620346), ('rooms', 0.035589781786207024), ('room', 0.035589781786207024), ('abnormalities', 0.03534819109035835), ('abnormal', 0.03534819109035835), ('low', 0.03512933482770558), ('railway', 0.03499471033222441), ('rying', 0.03498271884113838), ('ry', 0.03498271884113838), ('day', 0.03498237292204742), ('record', 0.034963776410854776), ('recorded', 0.034963776410854776), ('kept', 0.03491162312614871), ('whispered', 0.034744282545712386), ('whisper', 0.034744282545712386), ('whispering', 0.034744282545712386), ('whispers', 0.034744282545712386), ('whisperers', 0.034744282545712386), ('cutting', 0.03462961919775728), ('cuttings', 0.03462961919775728), ('secret', 0.03450129453818628), ('secretiveness', 0.03450129453818628), ('secrets', 0.03450129453818628), ('impression', 0.03444259268050877), ('gold refinery', 0.034313800348604824), ('deep', 0.03428625524569291), ('north', 0.033863678809727134), ('youth', 0.03358998803383291), ('arkham', 0.033257366916056755), ('south', 0.03318772346739606), ('abandoning', 0.033086707926183824), ('evidently impressed', 0.03262906394512451), ('heard', 0.03251136575686567), ('abnormally large force', 0.03239598349561486), ('fellow', 0.032354966184241496), ('head', 0.03221261229519266), ('headed', 0.03221261229519266), ('glimpsed', 0.0321141055539316), ('glimpses', 0.0321141055539316), ('glimpse', 0.0321141055539316), ('library', 0.03203714739240227), ('dead', 0.032025090098433115), ('sheer', 0.03197568341615722), ('native', 0.03173912616409202), ('natives', 0.03173912616409202), ('white', 
0.03165772791613387), ('whitely', 0.03165772791613387), ('orne', 0.03161376724702635), ('thar', 0.03131296228527557), ('probably', 0.031286631941134534), ('probable', 0.031286631941134534), ('ed', 0.031037966581312023), ('age', 0.031024646384772418), ('aged', 0.031024646384772418), ('ages', 0.031024646384772418), ('object', 0.031020931863632074), ('objects', 0.031020931863632074), ('objective', 0.031020931863632074), ('shipping days', 0.031015651234320846), ('river', 0.030951776986075832), ('region', 0.03093173224799051), ('regions', 0.03093173224799051), ('regional', 0.03093173224799051), ('evidence', 0.030815535209740258), ('moonlit nights', 0.03063956973658111), ('noted', 0.030514380747541862), ('note', 0.030514380747541862), ('notes', 0.030514380747541862), ('noting', 0.030514380747541862), ('blue', 0.030482718256779673), ('drop daown', 0.030468837333045694), ('pass', 0.030404011925348053), ('passing', 0.030404011925348053), ('passed', 0.030404011925348053), ('talked', 0.030368900294093816), ('talk', 0.030368900294093816), ('talking', 0.030368900294093816), ('talks', 0.030368900294093816), ('public', 0.03023931794643703), ('publications', 0.03023931794643703), ('publicly', 0.03023931794643703), ('creaking flights', 0.030194302248542297), ('uncle', 0.03012781296972719), ('nervous glances', 0.030104644781710384), ('know', 0.02996212893744773), ('knows', 0.02996212893744773), ('knowing', 0.02996212893744773), ('knowed', 0.02996212893744773), ('wide', 0.029856015226044464), ('git', 0.02984813788417336), ('gits', 0.02984813788417336), ('hideous', 0.02982443438663099), ('hideously', 0.02982443438663099), ('hideousness', 0.02982443438663099), ('increased', 0.02981856432243614), ('increasingly', 0.02981856432243614), ('increase', 0.02981856432243614), ('increasing', 0.02981856432243614), ('turn', 0.029707850655630134), ('turned', 0.029707850655630134), ('turning', 0.029707850655630134), ('sudden', 0.029624028714285724), ('suddenly', 0.029624028714285724), 
('suddenness', 0.029624028714285724), ('told', 0.029541342441122904), ('cud', 0.029412280790131227), ('fishy', 0.029404557561125293), ('hard', 0.02934037935992827), ('hardly', 0.02934037935992827), ('factory', 0.029324216013510103), ('factories', 0.029324216013510103), ('stories', 0.029279179101755266), ('nothin', 0.02914249636065371), ('coat', 0.029004452513909002), ('outsiders', 0.02898968854963859), ('space', 0.02890393156401177), ('spaces', 0.02890393156401177), ('big', 0.028860064277910838), ('blood', 0.028831893499950648), ('bloods', 0.028831893499950648), ('badly cut', 0.02878403651391981), ('heh', 0.02865809076645316), ('doorway', 0.028593338061439284), ('doorways', 0.028593338061439284), ('ty close', 0.028567072350719612), ('electric lights', 0.028547506025499818), ('harbour', 0.02820649255447534), ('harboured', 0.02820649255447534), ('saw', 0.02816231828401787), ('flash', 0.028124996999447482), ('flashing', 0.028124996999447482), ('flashes', 0.028124996999447482), ('flashed', 0.028124996999447482), ('pirate', 0.02811834416801913), ('pirates', 0.02811834416801913), ('decay', 0.028109527858257398), ('decayed', 0.028109527858257398), ('decaying', 0.028109527858257398), ('leaning', 0.028106995255932278), ('leaned', 0.028106995255932278), ('lean', 0.028106995255932278), ('somewhat youthful', 0.028059834135275172), ('real', 0.02796428795498296), ('run', 0.027889918362315084), ('running', 0.027889918362315084), ('runs', 0.027889918362315084), ('crumbling', 0.027852991126998104), ('bridges', 0.02772898764359063), ('legends', 0.027646479104918), ('legend', 0.027646479104918), ('l', 0.02762974549846509), ('lower', 0.027617545617351604), ('al', 0.02759624194643023), ('als', 0.02759624194643023), ('left', 0.027477734459508192), ('mind', 0.027470893706506752), ('minding', 0.027470893706506752), ('abandoned waterfront', 0.027387903449844476), ('actually', 0.027315878885310625), ('actual', 0.027315878885310625), ('actuality', 0.027315878885310625), ('especially', 
0.027268156418855338), ('mad', 0.027227945340799344), ('madness', 0.027227945340799344), ('renewals records', 0.027184047790274286), ('agin', 0.02716877967848764), ('monstrous', 0.0271110265202378), ('monstrously', 0.0271110265202378), ('ships', 0.02704892954659427), ('shipped', 0.02704892954659427), ('ship', 0.02704892954659427), ('words', 0.026915799766721572), ('hall', 0.02685305816683948), ('spaced gleams', 0.026659250178738162), ('station', 0.026637566624509577), ('jest', 0.026513257444592993), ('mere telling helps', 0.02642838885293397), ('main', 0.026324492719424), ('mainly', 0.026324492719424), ('traded', 0.02631584108274885), ('trade', 0.02631584108274885), ('amidst', 0.026115682385175785), ('narrow heads', 0.026080694295696018), ('silent', 0.02602005197625365), ('silently', 0.02602005197625365), ('tall', 0.025970257074222507), ('second', 0.025860064394785488), ('small', 0.02583909745501999), ('obviously', 0.025836896956937777), ('obvious', 0.025836896956937777), ('general', 0.025780953783168567), ('generally', 0.025780953783168567), ('afore', 0.025755693218018117), ('croakings', 0.025732600872497978), ('croak', 0.025732600872497978), ('croaking', 0.025732600872497978), ('peculiarly', 0.02571378309500359), ('peculiar', 0.02571378309500359), ('peculiarities', 0.02571378309500359), ('older fellows', 0.02566056995301301), ('hoarse', 0.025654248342378064), ('leading', 0.025619873135995674), ('lead', 0.025619873135995674), ('wal', 0.025598359077620626), ('forces', 0.025581884714847358), ('forced', 0.025581884714847358), ('forcing', 0.025581884714847358), ('naow', 0.025566710576617243), ('blasphemous abnormality', 0.02556576698776649), ('watery', 0.025425395104352728), ('feel', 0.025401213373167416), ('scarcely', 0.025336159380814894), ('source', 0.02533453275346392), ('sources', 0.02533453275346392), ('making', 0.02527737653292117), ('makes', 0.02527737653292117), ('make', 0.02527737653292117), ('designs', 0.025229037997134246), ('designed', 
0.025229037997134246), ('design', 0.025229037997134246), ('young', 0.025197954808360654), ('leaves', 0.025130246426057407), ('leaving', 0.025130246426057407), ('leave', 0.025130246426057407), ('haow', 0.025089419844877798), ('final', 0.025056960171558843), ('reache', 0.02504472669186799), ('reach', 0.02504472669186799), ('reached', 0.02504472669186799), ('reaching', 0.02504472669186799), ('washington', 0.025038419980895635), ('ancestral', 0.02490825431541724), ('ess', 0.024908254315416992), ('included', 0.024868095877526475), ('including', 0.024868095877526475), ('say', 0.02484312237195561), ('says', 0.02484312237195561), ('decrepit', 0.02472300853680305), ('kin', 0.024648988851688862), ('son', 0.02464612024095183), ('sons', 0.02464612024095183), ('help', 0.024632679801491877), ('le', 0.024581489821251867), ('green', 0.024511807835365673), ('tiara', 0.02445024779615059), ('tiaraed', 0.02445024779615059), ('tiaras', 0.02445024779615059), ('violently', 0.02442683978521787), ('violent', 0.02442683978521787), ('gleaming', 0.024414568793464556), ('gleam', 0.024414568793464556), ('gleamed', 0.024414568793464556), ('till', 0.024408382146207667), ('strangers', 0.02433924851132553), ('choked', 0.024301426841456492), ('red', 0.02422776389115001), ('finally emer', 0.02419270125254921), ('course', 0.024185902098768964), ('coursing', 0.024185902098768964), ('shut', 0.024165797102846123), ('heavy', 0.02412536435762123), ('single word', 0.024080875750974375), ('hear', 0.023932963939050735), ('hearing', 0.023932963939050735), ('nature', 0.023668922051203353), ('natural', 0.023668922051203353), ('naturally', 0.023668922051203353), ('devil', 0.023665390004738142), ('devils', 0.023665390004738142), ('safe', 0.02364613244507791), ('safely', 0.02364613244507791), ('kanakys', 0.023598631738709484), ('kanaky', 0.023598631738709484), ('felt immediate', 0.023570969585888647), ('vast', 0.02351360923802834), ('university', 0.023509776824398285), ('universal', 0.023509776824398285), 
('states', 0.023462556545735615), ('state', 0.023462556545735615), ('stately', 0.023462556545735615), ('outsid er', 0.023444114697578847), ('creature', 0.023341461981417348), ('creatures', 0.023341461981417348), ('noise', 0.02322592312126928), ('noises', 0.02322592312126928), ('awful', 0.023223484923420672), ('better', 0.0232108759227269), ('ones', 0.02303906171633816), ('imagination', 0.022995275032909542), ('imagine', 0.022995275032909542), ('imaginative', 0.022995275032909542), ('imagined', 0.022995275032909542), ('fresh', 0.022953547096559532), ('bad', 0.022938453830082334), ('steeples', 0.022828032628185353), ('steeple', 0.022828032628185353), ('sets', 0.022810976997749224), ('set', 0.022810976997749224), ('southward', 0.022802095733765508), ('books', 0.022775821014264362), ('book', 0.022775821014264362), ('disease', 0.022760308399840827), ('thanks', 0.02275655075419053), ('thankfully', 0.02275655075419053), ('thank', 0.02275655075419053), ('guttural', 0.022661536227704705), ('figure', 0.02264152112252198), ('figures', 0.02264152112252198), ('kilt', 0.022631261653141034), ('rattled', 0.02258170523851556), ('walking', 0.02255329075930372), ('walked', 0.02255329075930372), ('walks', 0.02255329075930372), ('walk', 0.02255329075930372), ('seed', 0.02249878359490145), ('disturbing', 0.022463320636159002), ('disturbed', 0.022463320636159002), ('condition', 0.022438177022550944), ('conditioned', 0.022438177022550944), ('view', 0.022368423998702317), ('viewing', 0.022368423998702317), ('reasons', 0.022353440825208955), ('reason', 0.022353440825208955), ('reasonably', 0.022353440825208955), ('inside', 0.02221519508098743), ('present', 0.02220726130149366), ('presently', 0.02220726130149366), ('presentable', 0.02220726130149366), ('noticed', 0.022200318504783653), ('notices', 0.022200318504783653), ('noticing', 0.022200318504783653), ('softly', 0.022151502052113116), ('soft', 0.022151502052113116), ('gilman', 0.022142648322375367), ('gilmans', 0.022142648322375367), 
('sight', 0.022118906417349215), ('drew', 0.02210002019790189), ('ordered', 0.022057133637400857), ('order', 0.022057133637400857), ('unpleasantly', 0.02203669808437001), ('ing', 0.02188726099613438), ('historical', 0.021843747661822784), ('historic', 0.021843747661822784), ('iron highway bridge', 0.021715162466286447), ('held', 0.021644663789761355), ('moonlight', 0.021625937120072783), ('uneven', 0.02160081495660298), ('namelessly', 0.021594042608710708), ('nameless', 0.021594042608710708), ('grotesqueness', 0.021565382294764575), ('grotesque', 0.021565382294764575), ('repulsion', 0.021542430168525195), ('repulsive', 0.021542430168525195), ('repulsiveness', 0.021542430168525195), ('crouching', 0.02152719572888127), ('crouched', 0.02152719572888127), ('crouch', 0.02152719572888127), ('foreigners seldom settled', 0.02146707175602976), ('feller', 0.02138590568785571), ('fellers', 0.02138590568785571), ('distance', 0.021343774873910527), ('ge', 0.02128928563651027), ('called', 0.02122416636946236), ('motor', 0.02116007042212661), ('followed', 0.021136763398338514), ('following', 0.021136763398338514), ('early', 0.021128499880507683), ('monsters', 0.02107644147601349), ('everybody trades', 0.0210639519263066), ('electrically', 0.021014457525979722), ('world', 0.02099652516715387), ('worldly', 0.02099652516715387), ('bolt', 0.02092810317662359), ('bolts', 0.02092810317662359), ('bolted', 0.02092810317662359), ('bolting', 0.02092810317662359), ('wife', 0.020922037617430715), ('v', 0.020912634384098416), ('simply', 0.02090066268477388), ('knock', 0.020898788709726467), ('knocking', 0.020898788709726467), ('knocked', 0.020898788709726467), ('foreign', 0.020888679389610195), ('foreigner', 0.020888679389610195), ('gaping', 0.020862899680386623), ('special', 0.020847625055289004), ('specially', 0.020847625055289004), ('suggestion', 0.02084068039114557), ('suggested', 0.02084068039114557), ('shock', 0.020825940754445046), ('shocking', 0.020825940754445046), ('shocked', 
0.020825940754445046), ('battered', 0.020820421136174445), ('battering', 0.020820421136174445), ('port', 0.02076975071659029), ('ports', 0.02076975071659029), ('porte', 0.02076975071659029), ('waxed', 0.02076300665619485), ('feeling slightly', 0.020741956514963883), ('cthulhu fhtagn', 0.020722182308093334), ('civic', 0.02068556230054198), ('floors', 0.020614022013281975), ('floor', 0.020614022013281975), ('gets', 0.02055150417977961), ('getting', 0.02055150417977961), ('odour imaginable', 0.020549723437823145), ('plainly', 0.020470548581350265), ('plain', 0.020470548581350265), ('whar', 0.02037275814282848), ('frequent', 0.020363912302952446), ('frequently', 0.020363912302952446), ('instead', 0.02031954674688701), ('gave', 0.02023819285590574), ('shoulder', 0.020237681443092924), ('frightful', 0.02011115806180203), ('fright', 0.02011115806180203), ('frightfulness', 0.02011115806180203), ('work', 0.020092024833263958), ('works', 0.020092024833263958), ('working', 0.020092024833263958), ('worked', 0.020092024833263958), ('walls', 0.02005969234221157), ('wall', 0.02005969234221157), ('insane', 0.020011920518713484), ('stark', 0.02001047634671691), ('map', 0.019944676547882217), ('musty', 0.01992296413659347), ('mustiness', 0.01992296413659347), ('warren', 0.01990616659839235), ('warrens', 0.01990616659839235), ('sharp', 0.019812609949164003), ('ll notice', 0.019808989926742224), ('free', 0.019743998123414284), ('nightmare', 0.019715559826902436), ('wholly', 0.019711147522110964), ('reasonable jumping', 0.0197109156986119), ('sargent', 0.019683407295466533), ('quickly', 0.019641636981572525), ('quick', 0.019641636981572525), ('unpleasant hints', 0.01953265054246711), ('stretching inland', 0.019519250525109513), ('cautious rattling', 0.01948083736734419), ('fortune', 0.019480482112473142), ('fortunate', 0.019480482112473142), ('fortunately', 0.019480482112473142), ('gently', 0.019471960643494178), ('gentle', 0.019471960643494178), ('parson', 0.019469472602141505), 
('parsons', 0.019469472602141505), ('use', 0.019460342711615498), ('useful', 0.019460342711615498), ('dense', 0.019456283985078395), ('arter', 0.019425442968323665), ('tradition', 0.019413915673450147), ('traditions', 0.019413915673450147), ('dreams', 0.019404724497836024), ('dream', 0.019404724497836024), ('renewed', 0.0194043191696938), ('renewal', 0.0194043191696938), ('renew', 0.0194043191696938), ('claw', 0.019385535406281778), ('directly', 0.019362591686104956), ('direction', 0.019362591686104956), ('directions', 0.019362591686104956), ('speech', 0.019341172439845657), ('crazy', 0.01929551954542408), ('moon', 0.019243379370730986), ('ominous', 0.019235144027704328), ('ominously', 0.019235144027704328), ('children', 0.019192594483389783), ('lly noticeable', 0.019128671984942495), ('distant', 0.01911619319817697), ('distantly', 0.01911619319817697), ('hands', 0.019110137963449667), ('hand', 0.019110137963449667), ('handed', 0.019110137963449667), ('worm', 0.019043337689913422), ('wormed', 0.019043337689913422), ('stretch', 0.01897663412912777), ('stretches', 0.01897663412912777), ('stretched', 0.01897663412912777), ('dropped', 0.018973041653126903), ('reel', 0.01896191502927897), ('reeling', 0.01896191502927897), ('reeled', 0.01896191502927897), ('horrible', 0.01895627498860474), ('dizzy', 0.01894334933983085), ('dizzying', 0.01894334933983085), ('velvet', 0.018943349339830697), ('astonishing', 0.018943349339830697), ('governess', 0.018943349339830624), ('brig', 0.018943349339830603), ('malay', 0.01894334933983057), ('trapping', 0.018943349339830558), ('trappings', 0.018943349339830558), ('sunken', 0.018943349339830544), ('allusion', 0.01894334933983053), ('allus', 0.01894334933983053), ('sumatry', 0.018943349339830516), ('cape', 0.018943349339830502), ('unpaved', 0.018943349339830478), ('ss', 0.018943349339830478), ('image', 0.01894334933983046), ('images', 0.01894334933983046), ('imag', 0.01894334933983046), ('raised', 0.01894334933983045), ('raise', 
0.01894334933983045), ('permanent', 0.01894334933983041), ('rope', 0.01894334933983035), ('elliptical', 0.018943349339830145), ('major', 0.018923979574211113), ('fit', 0.018866455317085373), ('fitted', 0.018866455317085373), ('scattered', 0.01884438759715222), ('try', 0.01884208241671667), ('tries', 0.01884208241671667), ('tried', 0.01884208241671667), ('trying', 0.01884208241671667), ('thoughts', 0.01879633691476053), ('offered', 0.018763197921017823), ('offers', 0.018763197921017823), ('offer', 0.018763197921017823), ('alien', 0.018721891534749464), ('alienation', 0.018721891534749464), ('marshy', 0.018696227971691282), ('fashion', 0.01863511932695446), ('incredible', 0.018591687473765914), ('remaining', 0.018577917804007398), ('remains', 0.018577917804007398), ('remained', 0.018577917804007398), ('remain', 0.018577917804007398), ('dismal', 0.018563971399806728), ('went', 0.01855448968151597), ('blocks', 0.018534205679655864), ('block', 0.018534205679655864), ('blocked', 0.018534205679655864), ('hev', 0.018513442282198648), ('marvelous', 0.018504703727419858), ('marvels', 0.018504703727419858), ('marvel', 0.018504703727419858), ('wind', 0.018503524186151577), ('flopping', 0.018477950209879444), ('flopped', 0.018477950209879444), ('miss', 0.01847100319790686), ('missing', 0.01847100319790686), ('common maps', 0.018416262208484555), ('war', 0.01839259181385516), ('wars', 0.01839259181385516), ('swimming', 0.018392398875308816), ('swim', 0.018392398875308816), ('land', 0.018388317774864448), ('landed', 0.018388317774864448), ('anybody', 0.018377678396907446), ('earth', 0.01832197468023302), ('cursed', 0.018310494952295843), ('curse', 0.018310494952295843), ('walakea', 0.018243487367822466), ('ef', 0.018197224957872772), ('weak curses', 0.018197148152591693), ('government', 0.018151970530847043), ('activity', 0.018091067625760994), ('activities', 0.018091067625760994), ('weakness', 0.018083801352887542), ('greyish', 0.01807696126625106), ('ph', 0.01799026879975863), 
('scale', 0.01797941325053991), ('let', 0.017971535865822286), ('stop', 0.017945217313641584), ('stopped', 0.017945217313641584), ('stopping', 0.017945217313641584), ('grounds', 0.017928384794494504), ('ground', 0.017928384794494504), ('bringing', 0.01791816929300962), ('bring', 0.01791816929300962), ('commercial', 0.01791626391313929), ('save', 0.01781950511724007), ('knick', 0.017794792508623123), ('ged', 0.017785926775401636), ('phantasy', 0.017774066757021537), ('shuttered', 0.017752986826919444), ('shutters', 0.017752986826919444), ('shutter', 0.017752986826919444), ('pious', 0.017707186979883056), ('marine abyss', 0.017700260725753158), ('poor', 0.01768486706865011), ('smoke', 0.01767219659470959), ('smoking', 0.01767219659470959), ('brooding', 0.01766764263392822), ('brood', 0.01766764263392822), ('hotel', 0.017607577541909322), ('center', 0.017603274196904765), ('sinister', 0.017571265830154493), ('gen', 0.01756670496192208), ('ols', 0.01755933737266238), ('ol', 0.01755933737266238), ('smell', 0.017492063142995805), ('offen', 0.01746614018200753), ('proved', 0.01745437221484936), ('proves', 0.01745437221484936), ('prove', 0.01745437221484936), ('evilly owed', 0.017439561340780126), ('unconsciously', 0.017434342112274508), ('unconscious', 0.017434342112274508), ('curiosity', 0.01741734922181923), ('antiquarian', 0.017417263536075672), ('shortly', 0.017412657954831205), ('short', 0.017412657954831205), ('abysses', 0.017399543409188936), ('rags', 0.017389549888090675), ('ragged', 0.017389549888090675), ('mansions', 0.01738424613305841), ('mansion', 0.01738424613305841), ('sarten', 0.017346769892717124), ('city', 0.01731559791448302), ('cities', 0.01731559791448302), ('somethin', 0.017289599586264026), ('departed', 0.017260925021348364), ('fairly', 0.017231463141050906), ('fair', 0.017231463141050906), ('nervously', 0.017208738916873408), ('nervousness', 0.017208738916873408), ('matt', 0.017177273222436375), ('robed', 0.017171821193486928), ('robes', 
0.017171821193486928), ('clerk merely', 0.017152360362323843), ('beard', 0.0171321459641413), ('east', 0.017121477421011912), ('bygone', 0.017120446416804706), ('cap', 0.017120161633074508), ('jump', 0.017068390572014848), ('sized', 0.0170665687436176), ('size', 0.0170665687436176), ('chance', 0.017059802774125832), ('chances', 0.017059802774125832), ('unaccountable', 0.017053432677012224), ('unaccountably', 0.017053432677012224), ('hinted', 0.017028603000564215), ('hinting', 0.017028603000564215), ('hint', 0.017028603000564215), ('chapters', 0.016978292709534597), ('chapter', 0.016978292709534597), ('waited', 0.016945944752744947), ('waiting', 0.016945944752744947), ('waites', 0.016945944752744947), ('waite', 0.016945944752744947), ('wait', 0.016945944752744947), ('wondered', 0.016890924298686882), ('wonder', 0.016890924298686882), ('wondering', 0.016890924298686882), ('wonders', 0.016890924298686882), ('commons', 0.016887847869086894), ('extreme', 0.016873674086211047), ('extremity', 0.016873674086211047), ('extremes', 0.016873674086211047), ('existed', 0.01685075580269347), ('exist', 0.01685075580269347), ('revived existence', 0.016850755802693466), ('revivals', 0.016850755802693462), ('pastor', 0.016850755802693317), ('dr', 0.01685075580269315), ('recent guideboo', 0.016850755802693025), ('shots', 0.016830306415975018), ('shot', 0.016830306415975018), ('ai', 0.016829166998701323), ('shall', 0.016813661412367707), ('fine', 0.016784503214487188), ('finely', 0.016784503214487188), ('stood', 0.01676686268563067), ('digital', 0.016700130271443982), ('fancied', 0.01668751059989713), ('fancy', 0.01668751059989713), ('exotic', 0.01668565974528151), ('exotically', 0.01668565974528151), ('rectangle', 0.01666338541947941), ('home', 0.0166601690131742), ('shade', 0.01665944592626364), ('flight', 0.01663439649371151), ('deposited', 0.01659562931491415), ('deposit', 0.01659562931491415), ('dare', 0.01658733602697104), ('dared', 0.01658733602697104), ('bestial', 
0.016573544821446413), ('du', 0.016548443088539), ('loose', 0.01654047871962618), ('loosing', 0.01654047871962618), ('broad', 0.01653140661558841), ('steady', 0.01651374374423635), ('fantastic', 0.016480589547142988), ('answer', 0.016426233414398386), ('answered', 0.016426233414398386), ('answering', 0.016426233414398386), ('iä', 0.016414052654099866), ('cautiously', 0.016379969496172823), ('extravagant', 0.016363946573016874), ('extravagance', 0.016363946573016874), ('lookin', 0.016297965277002405), ('mounting', 0.01629321620318897), ('mount', 0.01629321620318897), ('clothes', 0.016282468881132956), ('moved', 0.016234402408945494), ('moving', 0.016234402408945494), ('solitary boy', 0.016229218839465945), ('planned', 0.01620275390542538), ('plans', 0.01620275390542538), ('plan', 0.01620275390542538), ('planning', 0.01620275390542538), ('m', 0.01618795933178724), ('step', 0.016186405419051874), ('steps', 0.016186405419051874), ('known', 0.016186281809170047), ('nations', 0.016180738841559992), ('nation', 0.016180738841559992), ('national', 0.016180738841559992), ('obscure', 0.01613675867967968), ('family', 0.01612716474332872), ('families', 0.01612716474332872), ('slight', 0.01608269965676035), ('sidewalk', 0.016080759228326875), ('sidewalks', 0.016080759228326875), ('commerce', 0.016075643176510786), ('according', 0.016062851326076474), ('accordingly', 0.016062851326076474), ('accordance', 0.016062851326076474), ('visage', 0.01597925052099044), ('measures', 0.015947075115250654), ('measurably', 0.015947075115250654), ('pace', 0.01591435739478427), ('frantically', 0.015903426288273965), ('frantic', 0.015903426288273965), ('grassy', 0.01589247436186001), ('owing', 0.01584814888402552), ('taown', 0.01583572768516208), ('hez', 0.01578172096016697), ('creeks', 0.015780231818174423), ('creek', 0.015780231818174423), ('cupolas', 0.015776157377437813), ('cupola', 0.015776157377437813), ('curtained', 0.015746643224197374), ('curtains', 0.015746643224197374), ('power', 
0.01573694620891054), ('powers', 0.01573694620891054), ('driver', 0.015734914546080465), ('vestments', 0.01571996087207576), ('williamson', 0.015680494599327244), ('upper', 0.01567196090335512), ('sorts', 0.015668653488045536), ('sort', 0.015668653488045536), ('climb', 0.015648421366326704), ('climbing', 0.015648421366326704), ('climbed', 0.015648421366326704), ('occasional', 0.015626098167487914), ('occasionally', 0.015626098167487914), ('consciously', 0.015618972501123988), ('conscious', 0.015618972501123988), ('stay', 0.01561134505690504), ('stayed', 0.01561134505690504), ('definite', 0.015609677134853322), ('definitely', 0.015609677134853322), ('length', 0.015607985113760733), ('features', 0.015552024088626496)]

Questions

  • What differences do you see in their outputs? COMMENT ON HOW SLOW!
  • Using textacy utilities, combine like key phrases. SEE ABOVE
  • Do the outputs make sense given your text? ANSWER THIS QUESTION

Answers:

  1. The run time for summa was 3 m 26 s and for Textacy 1.6 s. That's a huge difference.
  2. It did not work very well: it picks up main characters' names such as "Obed Marsh", but it also returns many nonsensical phrases.
  3. Only partly: it returns many nonsensical phrases.

NER + Snorkel

  • Use spacy to extract named entities.
  • Create a summary of your named entities.
# Run spaCy once over the whole book text; this is easier than looping over
# words AND rows in pandas.
spacy_ner_tagged = [(str(word.text), word.label_) for word in nlp(book).ents]

# each row represents one entity mention: its text and its spaCy label
DF_NER = pd.DataFrame(
    spacy_ner_tagged,
    columns=["token", "entity"],
)
# how many mentions were found per entity label
print(DF_NER['entity'].value_counts())

# rows = entity text, columns = entity label, cells = number of mentions
DF_NER2 = pd.crosstab(DF_NER['token'], DF_NER['entity'])
print(DF_NER2)

# convert to true/false to add up how many label columns are non-zero
DF_NER2['total'] = DF_NER2.astype(bool).sum(axis=1)
# show the rows whose text received more than one distinct label
DF_NER2[DF_NER2['total'] > 1]
PERSON 403
ORG 383
CARDINAL 204
GPE 200
DATE 144
FAC 90
TIME 74
NORP 70
LOC 66
ORDINAL 59
PRODUCT 20
EVENT 20
WORK_OF_ART 15
QUANTITY 4
LANGUAGE 4
MONEY 2
LAW 1
Name: entity, dtype: int64
entity CARDINAL DATE EVENT FAC GPE LANGUAGE LAW LOC MONEY \
token
' night 0 0 0 0 0 0 0 0 0
'seventy-eight 0 2 0 0 0 0 0 0 0
About four hours 0 0 0 0 0 0 0 0 0
Adam 0 0 0 0 0 0 0 0 0
Adams Street 0 0 0 2 0 0 0 0 0
... ... ... ... ... ... ... ... ... ...
weedy 0 0 0 0 0 0 0 0 0
year 0 2 0 0 0 0 0 0 0
years 0 11 0 0 0 0 0 0 0
years ago 0 4 0 0 0 0 0 0 0
years before 0 2 0 0 0 0 0 0 0

entity NORP ORDINAL ORG PERSON PRODUCT QUANTITY TIME \
token
' night 0 0 0 0 0 0 2
'seventy-eight 0 0 0 0 0 0 0
About four hours 0 0 0 0 0 0 2
Adam 0 0 0 2 0 0 0
Adams Street 0 0 0 0 0 0 0
... ... ... ... ... ... ... ...
weedy 0 0 0 2 0 0 0
year 0 0 0 0 0 0 0
years 0 0 0 0 0 0 0
years ago 0 0 0 0 0 0 0
years before 0 0 0 0 0 0 0

entity WORK_OF_ART
token
' night 0
'seventy-eight 0
About four hours 0
Adam 0
Adams Street 0
... ...
weedy 0
year 0
years 0
years ago 0
years before 0

[430 rows x 17 columns]

We got some words that are tagged as more than one entity. This is not a dictionary lookup and not a one-to-one mapping: it is a learned model that uses the surrounding text, so its labels may or may not be accurate.

Apply Snorkel to your data to show any relationship between names.

get the data into a good format

we use this code to store all the entities

# Rows collected by get_entities(); later turned into the Snorkel dataframe.
stored_entities = []

# first get the entities, must be two for relationship matches
def get_entities(x):
  """
  Record every pair of named entities that spaCy finds in one piece of text.

  For each pair of entity mentions in ``x`` (pairs keep the order in which
  spaCy lists the entities) one row is appended to the module-level
  ``stored_entities`` list:

    [text, tokens, entity 1 text, entity 2 text,
     (entity 1 first token idx, entity 1 last token idx),
     (entity 2 first token idx, entity 2 last token idx)]

  Both index tuples are inclusive positions into ``tokens``. Nothing is
  appended when the text contains fewer than two entities.

  Args:
    x: the text to process.

  Returns:
    None; results are accumulated in ``stored_entities``.
  """
  # run the pipeline once; tokens and entity spans come from the same Doc
  processed = nlp(x)
  tokens = [word.text for word in processed]
  # Every labeled entity is kept (not only PERSON), so "person 1/2" may also
  # be places, dates, etc.  The token span is taken from spaCy itself rather
  # than by searching for matching words, which breaks when a word repeats.
  # ``ent.end`` is exclusive, hence the -1 to get the last token of the span.
  mentions = [
    (str(ent), (ent.start, ent.end - 1))
    for ent in processed.ents
    if ent.label_ != ""
  ]
  # only move on if this text has at least two entities
  if len(mentions) < 2:
    return
  # one stored row per pair combination
  for (name1, span1), (name2, span2) in combinations(mentions, 2):
    stored_entities.append(
      [x,       # original text
       tokens,  # tokens
       name1,   # person 1 name
       name2,   # person 2 name
       span1,   # person 1 (first, last) token tuple
       span2,   # person 2 (first, last) token tuple
      ])

# get_entities is called for its side effect of filling stored_entities;
# the Series returned by apply() is not used.
DF['sentences'].apply(get_entities)

# create dataframe in snorkel structure: one row per entity pair
DF_dev = pd.DataFrame(
    stored_entities,
    columns=[
        "sentence",
        "tokens",
        "person1",
        "person2",
        "person1_word_idx",
        "person2_word_idx",
    ],
)

DF_dev.head(10)

This dataframe has the sentences and their tokens, the two entities of each pair (person 1 and person 2), and the tuples that give the start and end token positions of each of those phrases.

some of these dependencies are meaningless or useless

figure out where to look (between and to the left)

# live locate home road roads in at street (locations tied together)
# family terms for people 

# get words between the data points 
@preprocessor()
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Attach the tokens lying between the two mentions as ``between_tokens``.

    The slice begins one position past the second index stored for person 1
    and stops just before the first index stored for person 2.
    """
    first_after_person1 = cand.person1_word_idx[1] + 1
    person2_begin = cand.person2_word_idx[0]
    cand.between_tokens = cand.tokens[first_after_person1:person2_begin]
    return cand

# get words next to the data points
@preprocessor()
def get_left_tokens(cand: DataPoint) -> DataPoint:
    """
    Attach the tokens in a length-5 window to the left of each mention.

    Sets ``cand.person1_left_tokens`` and ``cand.person2_left_tokens`` from
    the tokens that precede the first index stored for each person, and
    returns the same data point.
    """
    # TODO: need to pass window as input params
    window = 5

    # NOTE: the slice stops at -1, so the token directly before the mention
    # is not part of the window; only the `window` tokens before it are kept.
    end = cand.person1_word_idx[0]
    cand.person1_left_tokens = cand.tokens[0:end][-1 - window : -1]

    end = cand.person2_word_idx[0]
    cand.person2_left_tokens = cand.tokens[0:end][-1 - window : -1]
    return cand

figure out what to look for

# live locate home road roads in at street (locations tied together)
# family terms for people 

# Values returned by the labeling functions: 1 when a cue word is found,
# 0 when the function has nothing to say about the pair.
found_location = 1
found_family = 1
ABSTAIN = 0

# Cue words that suggest two entities are tied together by a location.
# Each word must be its own set element; adjacent string literals without
# commas would be concatenated into one long, never-matching string.
location = {
    "live",
    "living",
    "locate",
    "located",
    "home",
    "road",
    "roads",
    "street",
    "streets",
    "in",
    "at",
    "of",
}

@labeling_function(resources={"location": location}, pre=[get_text_between])
def between_location(x, location):
    """Return found_location when a location cue word lies between the two mentions, else ABSTAIN."""
    cue_hits = location.intersection(set(x.between_tokens))
    if len(cue_hits) > 0:
        return found_location
    return ABSTAIN

@labeling_function(resources={"location": location}, pre=[get_left_tokens])
def left_location(x, location):
    """Return found_location when a location cue word appears in the left window of either mention, else ABSTAIN."""
    cues = set(location)
    # person 1's window is checked first; person 2's only if that one misses
    if cues & set(x.person1_left_tokens) or cues & set(x.person2_left_tokens):
        return found_location
    return ABSTAIN

# Cue words that suggest a family relationship between two entities.
# Each word must be its own set element; adjacent string literals without
# commas would be concatenated into one long, never-matching string.
family = {
    "spouse",
    "wife",
    "husband",
    "ex-wife",
    "ex-husband",
    "marry",
    "married",
    "father",
    "mother",
    "sister",
    "brother",
    "son",
    "daughter",
    "grandfather",
    "grandmother",
    "uncle",
    "aunt",
    "cousin",
    "boyfriend",
    "girlfriend",
}

@labeling_function(resources={"family": family}, pre=[get_text_between])
def between_family(x, family):
    """Return found_family when a family cue word lies between the two mentions, else ABSTAIN."""
    cue_hits = family.intersection(set(x.between_tokens))
    if len(cue_hits) > 0:
        return found_family
    return ABSTAIN

@labeling_function(resources={"family": family}, pre=[get_left_tokens])
def left_family(x, family):
    """Return found_family when a family cue word appears in the left window of either mention, else ABSTAIN."""
    cues = set(family)
    # person 1's window is checked first; person 2's only if that one misses
    if cues & set(x.person1_left_tokens) or cues & set(x.person2_left_tokens):
        return found_family
    return ABSTAIN

# the labeling functions to run, two for location and two for family
lfs = [between_location, left_location, between_family, left_family]

# wrap them in a pandas applier ...
applier = PandasLFApplier(lfs)

# ... and label every candidate row of the dataset
L_dev = applier.apply(DF_dev)
100%|██████████| 1206/1206 [00:01<00:00, 843.50it/s]
L_dev
# put the four labeling-function outputs next to the candidate pairs;
# column names follow the order of the functions in `lfs`
DF_combined = pd.concat(
    [
        DF_dev,
        pd.DataFrame(
            L_dev,
            columns=["location1", "location2", "family1", "family2"],
        ),
    ],
    axis=1,
)
DF_combined.head()

# number of labeling functions (0, 1 or 2) that fired for each relation type
DF_combined['location_yes'] = DF_combined['location1'] + DF_combined["location2"]
DF_combined['family_yes'] = DF_combined['family1'] + DF_combined["family2"]

print(DF_combined['location_yes'].value_counts())
print(DF_combined['family_yes'].value_counts())
0 513
1 445
2 248
Name: location_yes, dtype: int64
0 1102
1 68
2 36
Name: family_yes, dtype: int64

What might you do to improve the default NER extraction?

  • We had entities like "Obed Marsh" and "Marsh" that referred to the same person, and many other entities that refer to the same thing in different words. We should do Named Entity Disambiguation (NED). After getting the names/key phrases, we have to figure out what those things refer to in the real world by putting NER and NED together, i.e. do Named Entity Linking (NEL).
  • We did POS tagging, NER and KPE. We still need parsing and coreference resolution to put together all mentions of the same entity, and to link them to a database of definitions of people/places/things, for example by using a trained service such as an Azure API, or free options like DBpedia.

Knowledge Graphs

Slides Version

  • Based on the chosen text, add entities to a default spacy model.
  • Add a norm_entity, merge_entity, and init_coref pipelines.
  • Update and add the alias lookup if necessary for the data.
  • Add the name resolver pipeline.

Or Use Your Snorkel Output

  • Create a co-occurrence graph of the entities linked together in your text.

We need a dataframe containing entity 1, entity 2, and a count of how many times they occur together, for both the location and the family labels.

# locations only: pairs for which at least one location function fired
DF_loc = DF_combined[DF_combined['location_yes'] > 0]
DF_loc = DF_loc[['person1', 'person2']].reset_index(drop=True)

# count how many times each (person1, person2) pair occurs
cooc_loc = DF_loc.groupby(by=["person1", "person2"], as_index=False).size()

# family only: pairs for which at least one family function fired
DF_fam = DF_combined[DF_combined['family_yes'] > 0]
DF_fam = DF_fam[['person1', 'person2']].reset_index(drop=True)

cooc_fam = DF_fam.groupby(by=["person1", "person2"], as_index=False).size()

# take out issues where entity 1 == entity 2
cooc_loc = cooc_loc[cooc_loc['person1'] != cooc_loc['person2']]
cooc_fam = cooc_fam[cooc_fam['person1'] != cooc_fam['person2']]

print(cooc_loc.head())
print(cooc_fam.head())
person1 person2 size
0 Adams — Geor 2
1 Adams — Main Street 2
2 Africa Asia 2
3 Africa the South Seas 2
4 Anna Tilton the hour 2
person1 person2 size
1 Arkham James Williamson 2
2 Arkham Obed Marsh 2
3 Arkham Ohio 2
4 Arkham Zadok 2
5 B. Lapham Peabody James Williamson 2

This creates a dataframe of node 1 and then node 2 (entity 1 to entity 2) and then frequency (size)

# start by plotting the whole thing for location
# keep only the pairs that co-occur more than once
cooc_loc_small = cooc_loc[cooc_loc['size'] > 1]
# nodes are entities; each edge carries the co-occurrence count as 'weight'
graph = nx.from_pandas_edgelist(
    cooc_loc_small[['person1', 'person2', 'size']]
    .rename(columns={'size': 'weight'}),
    source='person1',
    target='person2',
    edge_attr=True,
)

pos = nx.kamada_kawai_layout(graph, weight='weight')

_ = plt.figure(figsize=(20, 20))
nx.draw(
    graph,
    pos,
    node_size=1000,
    node_color='skyblue',
    alpha=0.8,
    with_labels=True,
)
plt.title('Graph Visualization', size=15)

# draw every edge again with a line width equal to its co-occurrence weight
for (node1, node2, data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(
        graph,
        pos,
        edgelist=[(node1, node2)],
        width=width,
        edge_color='#505050',
        alpha=0.5,
    )

plt.show()
plt.close()
  • This creates a dataframe of node 1 and then node 2 (entity 1 to entity 2) and then frequency (size)
cooc_fam
# nodes are entities; each edge carries the co-occurrence count as 'weight'
graph = nx.from_pandas_edgelist(
    cooc_fam[['person1', 'person2', 'size']]
    .rename(columns={'size': 'weight'}),
    source='person1',
    target='person2',
    edge_attr=True,
)

pos = nx.kamada_kawai_layout(graph, weight='weight')

_ = plt.figure(figsize=(20, 20))
nx.draw(
    graph,
    pos,
    node_size=1000,
    node_color='skyblue',
    alpha=0.8,
    with_labels=True,
)
plt.title('Graph Visualization', size=15)

# draw every edge again with a line width equal to its co-occurrence weight
for (node1, node2, data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(
        graph,
        pos,
        edgelist=[(node1, node2)],
        width=width,
        edge_color='#505050',
        alpha=0.5,
    )

plt.show()
plt.close()